Data setup

Read in source data.

library(tidyverse)
library(mrcommons)
library(mrdrivers)
library(magrittr)

# reed in source data
foodEmi <- readSource("EDGARfood", subtype="foodSystemSector")
foodBal <- calcOutput("FAOmassbalance", aggregate = F)
foodBal <- time_interpolate(foodBal, seq(1990,2018) )

Process source data

# aggregate food products to kcr kli kds k
sets <- read.csv(system.file("extdata", "magpiesets.csv", package = "magpiesets"))
sets %>% select(all,kcr,kli,ksd,k) -> mapping
foodBal %>% toolAggregate(rel= slice(mapping, 1:44),from="all",to="kcr+kli+ksd+k" , 
                          dim = 3.1, partrel  =T) %>%
        .[,,"",invert=T] -> foodBal_p

# subset the data that will be used for the emission factors
foodBal_p <- foodBal_p[,,c("wm","dm","nr")][,,c("production","food", "milling" ,
                                                "refining", "extracting","fermentation",  
                                                "distilling", "waste", "domestic_supply", 
                                                "households")]

# aggregate the processing items to processing cateory
map2 <- tibble(source= c("production","food", "milling" ,"refining",
                         "extracting","fermentation",  "distilling", 
                         "waste", "domestic_supply", "households")  , 
               tar = c("production","processing", "processing" ,
                       "processing", "processing","processing",  
                       "processing", "waste","domestic_supply", 
                       "households"))

foodBal_p2<-  toolAggregate(foodBal_p, map2, from="source", to="tar",dim=3.2)


# pivot the objects to wide format
pivot_wider(as.data.frame( foodBal_p2), id_cols = c("Year", "Region"), names_from = starts_with("Data") ,  values_from = "Value") ->datBal

pivot_wider(as.data.frame( foodEmi), id_cols = c("Year", "Region"), names_from = starts_with("Data") ,  values_from = "Value") ->datEmi

data<- left_join(datEmi, datBal)

Add possible drivers/predictors to the data.

##########  add drivers to the data

# add finer resolved processing items
# pivot_wider(as.data.frame( foodBal_p), id_cols = c("Year", "Region"), names_from = starts_with("Data") ,  values_from = "Value") ->datBal_fine
# data <- left_join(data,datBal_fine)
 

### add extra infromation from foodBal as drivers

# add finer resolved processing and production items
foodBal[,,c("food","milling","refining","extracting"  ,"distilling") ][,,"dm"] %>% dimSums(.,dim=3.2) %>%  collapseDim() %>% as.data.frame() %>%  
  pivot_wider(names_from = Data1, values_from = "Value") -> driv_procc
foodBal[,,c("production") ][,,"dm"]  %>%  collapseDim() %>% as.data.frame() %>%  
  pivot_wider(names_from = Data1, values_from = "Value") -> driv_prod
driv_procc %>% rename_with( .cols = !(1:3) , .fn =  ~ paste0("driv_proc_", .x) ) -> driv_procc
driv_prod %>% rename_with( .cols = !(1:3) , .fn =  ~ paste0("driv_prod_", .x) ) -> driv_prod 
data<- left_join(data, driv_procc )
data<- left_join(data, driv_prod )


#add export as driver
exp <- dimSums(foodBal[,,"export"][,,"wm"], dim=3)
data<- left_join( data, select(as.data.frame( exp) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_exp = Value)


### add drivers from other data sources

#read data
gdp <- calcOutput("GDPPast", aggregate = F)
pop <- calcOutput("PopulationPast", aggregate = F)
affl <- collapseDim(gdp)/collapseDim(pop)
urb <- calcOutput("UrbanPast", aggregate = F)
dS <- calcOutput("DevelopmentState", aggregate = F) # needs to be interpolated
dS <- time_interpolate(dS, 1990:2018)
govI <- calcOutput("GovernanceIndicator", aggregate = F) # only from 1995
govI <- time_interpolate(govI, 1990:2018)

#add data
data<- left_join( data, select(as.data.frame( affl) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_aff = Value)
data<- left_join( data, select(as.data.frame( pop) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_pop = Value)
data<- left_join( data, select(as.data.frame( gdp) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_gdp = Value)
data<- left_join( data, select(as.data.frame( urb) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_urb = Value)
data<- left_join( data, select(as.data.frame( govI[,,"SSP1"]) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_govI = Value)
data<- left_join( data, select(as.data.frame( dS[,,"SSP1"]) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_dS = Value)



## add some infromative data about regions and country names
regmap        <- toolGetMapping("regionmappingH12.csv")
data<- left_join(data, regmap %>% rename(Region=CountryCode) ,by="Region") %>%  
  rename( info_countrynames = X, driv_largReg  = RegionCode)



## remove some NA or zero rows 

#sort out country that are completely zero in fooBal
data %>% group_by(Region) %>% summarise(across(starts_with("k"), sum)) %>% rowwise %>%  
  mutate(sum= sum(across(starts_with("k") )) ) %>% filter(sum==0) %>%  pull(Region) -> sortout # zeros
data %<>% filter(! Region %in% sortout)

#remove emission columns that are completely zero
data %<>% select(!(contains("F-gases" )&!contains("Packaging")&!contains("Retail") )) 

#remove households_dm which is a completly zero data column
# data %>% select(contains("households_dm"))  %>% 
#   summarise_each(function(x)sum(x, na.rm = T)) %>% t()
data %<>% select(!(contains("households_dm") )) 

# remove vestigous cell columns
data %<>% select(-Cell)

Choose denominators of emission factors

Emission factors assume a linear relationship between the emissions from a given source and a related (economic) activity. While the linear relation does not need to be the same for all countries and times, a good starting point to choose which emission to link to which activity might still be the global correlation coefficients.

### corellation plots
data %>% select(!Year&!Region &!starts_with("driv_")&!starts_with("inf") )-> datac

datac %>% drop_na() %>% cor() %>% 
  .[1:26,27:ncol(datac)] %>% t -> tem  

tem[order(str_extract(row.names(tem), "_.*_") ) , ] %>%  corrplot::corrplot( col.lim = c(-0.1,1), method = "number")

The correlation structure is less clear than one could hope for. For the production emissions it seems common-sensical to choose a production quantity as a denominator. “k_production_dm” seems to be a good first idea due to the high correlations.
For the processing emissions there is no single item with high correlations. “k_processing_dm” is chosen for N2O , “kli_processing_dm” for CO2 and “kcr_processing_wm” for CH4.

Regression analysis

As said the emission factors can vary for different times and regions. We are interested in finding the underlying drivers in order to predict emissions for different economic scenarios.

A simple linear regression model for the emission factors is

\[ emission_{g,c,y}/ m_{c,y} =a+b_{1}* d_{1,c,y}+b_2* d_{2,c,y} + b_3* d_{3,c,y}+ \cdots + \eta \] with \[\eta \sim N(0,\epsilon )\] g is the GHG, c is the country, y is the year, a is the intercept, \(b_i\) are the slopes, \(d_{i,c,y}\) are the drivers in the data and \(\eta\) is the normally distributed error.

Since we are interested in predicting the emissions and not the emission factor in the end it makes sense to work with switch the model a bit by bringing the factor \(m_{c,y}\) to the RHS. The least squared estimates will minimize residual squares for the emissions and not the emission factors then.

\[ emission_{g,c,y} =a* m_{c,y}+ b_1* d_{1,c,y}* m_{c,y}+b_2* d_{2,c,y}*m_{c,y} + b_3* d_{3,c,y}*m_{c,y}+ \cdots + \gamma \] with \[\gamma \sim N(0,\delta )\]

It should be mentioned that a standard assumption of this regression model is violated in our case: The observations \(emission_{g,c,y}\) \(m_{c,y}\) \(d_{i,c,y}\) for different values of c and y are not independent. Hence the results, especially the significance of influence of drivers, need to be treated with caution. A way of avoiding this assumption, would be the use of mixed effect models. Still to get a first idea of the drivers the standard model is used.

The following linear models where builded by successive elimination of independent variables. Further olsrr a tool for automatic selection of independent variables was used.

Production emissions

The following function is helful for deciding which variable to eliminate.

elimHelper <- function(lR) {
  
  f=NULL
  cN <- colnames(lR)
  for (n in cN[3:length(cN)]){
    lR2 <- lR %>% select( -n)
    model2 <- lm( dep ~ k_production_nr +.:k_production_nr+0 , data = lR2 )
    r2 <- summary(model2)$r.squared
    add <- setNames( r2,n)
    f <- c(f, add)
  }
  model <- lm( dep ~ k_production_nr +.:k_production_nr+0 , data = lR)
  r<-summary(model)$coefficients[,4]  
  print("p-value")
  print(sort(r))
  print("r-change")
  print(summary(model)$r.squared)
  print(sort(f))
  
}

N2O

Build the model and print out some statistics.

#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA

#### N2O production emissions
lR <- data %>% mutate(dep=GWP_100_N2O_Production)-> q

q %>% select(dep,k_production_nr,driv_prod_oilpalm,driv_prod_pasture,driv_prod_res_nonfibrous, driv_pop, driv_dS, driv_aff)->lR
  
  
# get nonzero processing items
data %>% select(starts_with("driv_pr")) %>% summarise(across(where(is.numeric), sum )) %>% unlist() -> dd
dd[dd!=0] %>% names() -> nonzero_driv

# model <- lm( dep ~ k_production_nr + (driv_prod_oilpalm):k_production_nr +
#                (driv_prod_oilpalm):k_production_nr+
#                (driv_prod_pasture):k_production_nr+
#                (driv_prod_res_nonfibrous):k_production_nr+
#                driv_pop+
#                driv_aff+
#                driv_rice:k_production_nr +0 , data = lR )



model2 <- lm( dep ~ k_production_nr +k_production_nr:driv_prod_pasture+ k_production_nr:driv_prod_res_nonfibrous  +0 , data = lR )

#summary(model)
#lR %>% colnames() %>% dput

#k<- olsrr::ols_step_all_possible(model)

summary(model2)
## 
## Call:
## lm(formula = dep ~ k_production_nr + k_production_nr:driv_prod_pasture + 
##     k_production_nr:driv_prod_res_nonfibrous + 0, data = lR)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -37111   -418    -12    338  37317 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## k_production_nr                           1.101e+04  3.612e+01  304.91   <2e-16
## k_production_nr:driv_prod_pasture        -5.998e+00  9.351e-02  -64.15   <2e-16
## k_production_nr:driv_prod_res_nonfibrous  1.673e+02  2.241e+00   74.64   <2e-16
##                                             
## k_production_nr                          ***
## k_production_nr:driv_prod_pasture        ***
## k_production_nr:driv_prod_res_nonfibrous ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3692 on 5420 degrees of freedom
##   (116 Beobachtungen als fehlend gelöscht)
## Multiple R-squared:  0.9845, Adjusted R-squared:  0.9845 
## F-statistic: 1.147e+05 on 3 and 5420 DF,  p-value: < 2.2e-16
### driver: rice cultivation

### literature : rice cultivation, nitrogen fertilisation (manure, sythetic N)

Plot predictions vs datapoints.

p<-ggplot(
  data #%>% filter(Year=="2000" )
                      , aes(x=1.099e+04*k_production_nr 
                            -5.977e+00 * k_production_nr*driv_prod_pasture
                            + k_production_nr*driv_prod_res_nonfibrous  *1.676e+02, 
                            y= GWP_100_N2O_Production, label = Region, Year= Year , 
                            #color = driv_prod_rice_pro/(k_production_dm), 
                            country= info_countrynames
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

CO2

#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA

#### CO2 production emissions, k_households_nr works better as denominator, why?? 


lR <- data %>% mutate(dep=GWP_100_CO2_Production)-> q

q %>% select(dep,k_production_nr,starts_with("driv_prod"),  driv_aff, driv_dS, driv_exp, driv_govI, driv_pop, k_households_nr)->lRPre

lRPre %>% select(-c(driv_prod_ethanol, driv_prod_oilpalm,driv_prod_oils,driv_prod_others , 
                 driv_prod_sunflower,driv_prod_foddr, driv_prod_betr,driv_prod_begr,
                 driv_prod_scp, driv_prod_cottn_pro, driv_prod_trce, driv_prod_tece,
                 driv_prod_livst_egg, driv_prod_res_nonfibrous, driv_prod_woodfuel,driv_prod_alcohol,
                 driv_prod_brans, driv_exp,driv_prod_res_cereals,driv_prod_res_fibrous,
                 driv_prod_livst_rum, driv_prod_soybean, driv_prod_oilcakes,
                 driv_prod_molasses, driv_prod_livst_chick ,driv_prod_sugar,
                 driv_prod_cassav_sp, driv_prod_fibres, driv_prod_rapeseed, driv_prod_livst_pig,
                 driv_aff, driv_prod_fish, driv_prod_sugr_beet, driv_prod_sugr_cane,
                 driv_prod_puls_pro, driv_govI,driv_dS, driv_prod_rice_pro, 
                 )) -> lR

#elimHelper(lR)
#model <- lm( dep ~ k_production_nr +.:k_production_nr+0 , data = lR )

model2 <- lm( dep ~ k_production_nr+ k_production_nr:driv_prod_pasture+ k_production_nr:driv_pop+0 , data = lR )

#model3 <- lm( dep ~ k_households_nr+ k_households_nr:. +0 , data = lRPre )

summary(model2)
## 
## Call:
## lm(formula = dep ~ k_production_nr + k_production_nr:driv_prod_pasture + 
##     k_production_nr:driv_pop + 0, data = lR)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -35344   -566    -13    533  83912 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## k_production_nr                   4385.55214   55.78662   78.61   <2e-16 ***
## k_production_nr:driv_prod_pasture   -4.18502    0.13878  -30.16   <2e-16 ***
## k_production_nr:driv_pop             5.06529    0.07384   68.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5716 on 5420 degrees of freedom
##   (116 Beobachtungen als fehlend gelöscht)
## Multiple R-squared:  0.9176, Adjusted R-squared:  0.9176 
## F-statistic: 2.013e+04 on 3 and 5420 DF,  p-value: < 2.2e-16
#k<-olsrr::ols_step_all_possible(model3)

### drivers: no clear driver visible, in SSA urbanization makes a difference

### literature: mechanization of agriculture
p<-ggplot(
  data #%>% filter(Year=="2000" )
                      , aes(x=4384.08708*k_production_nr 
                            -4.18417 *k_production_nr*driv_prod_pasture +
                              k_production_nr*driv_pop  *    5.06605
                            , y= GWP_100_CO2_Production, label = Region, 
                            Year= Year, 
                            country = info_countrynames # , 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

CH4

#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA

#### CH4 production emissions
dataM <- select(data,-starts_with("info")) 

dataM %>% mutate(dep=GWP_100_CH4_Production) %>% select(dep,k_production_nr,starts_with("driv_prod"),  driv_aff, driv_dS, driv_exp, driv_govI, driv_pop)->lRPre

# these variables where eliminated since they did not contribute to the dependent variable
lRPre %>% select(-c(driv_prod_res_fibrous,driv_prod_oilpalm,  driv_prod_rapeseed,driv_prod_res_nonfibrous, 
                    driv_prod_ethanol, driv_prod_livst_chick, driv_aff , driv_prod_puls_pro , driv_prod_scp , driv_prod_betr, driv_prod_betr , driv_prod_begr  , driv_prod_wood , driv_prod_tece, driv_prod_fish, driv_dS     , driv_prod_foddr , driv_prod_others , driv_prod_sugr_beet, driv_prod_sugar     , driv_prod_livst_milk   , driv_prod_alcohol  , driv_prod_molasses, driv_exp , driv_prod_distillers_grain , driv_prod_maiz, driv_prod_potato , driv_prod_cassav_sp  , driv_prod_oils , driv_prod_oilcakes , driv_prod_sunflower, driv_prod_cottn_pro, driv_prod_groundnut  , driv_prod_livst_egg , driv_prod_fibres, driv_prod_woodfuel  , driv_prod_trce    )) -> lR 

#elimHelper(lR)
#model <- lm( dep ~ k_production_nr + .:k_production_nr+0 , data = lR)
#k<-olsrr::ols_step_all_possible(model)

# resulting model
model <- lm( dep ~ k_production_nr +
               k_production_nr:driv_prod_brans +
               k_production_nr:driv_prod_res_cereals +0 , data = lR)

summary(model)
## 
## Call:
## lm(formula = dep ~ k_production_nr + k_production_nr:driv_prod_brans + 
##     k_production_nr:driv_prod_res_cereals + 0, data = lR)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -97295   -636      5   1105  91959 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## k_production_nr                       24641.624    105.671   233.2   <2e-16 ***
## k_production_nr:driv_prod_brans       -1255.008     12.312  -101.9   <2e-16 ***
## k_production_nr:driv_prod_res_cereals   179.961      1.505   119.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12290 on 5420 degrees of freedom
##   (116 Beobachtungen als fehlend gelöscht)
## Multiple R-squared:  0.9723, Adjusted R-squared:  0.9723 
## F-statistic: 6.348e+04 on 3 and 5420 DF,  p-value: < 2.2e-16
### old model
# p<- ggplot(
#   data #%>% filter( Year=="2000")
#                       , aes(x=k_production_nr, y= GWP_100_CH4_Production, label = Region, Year= Year  ))+ 
#   geom_point(aes(color = ((driv_rice+driv_milk*2)/driv_aff)^0.6 ) )+ #geom_text(aes(color = 
#   geom_smooth(method='lm', formula= y~x, aes(group=1))
# 
# plotly::ggplotly(p)

### drivers: rice, milk prodcutin, different world regions

###: literature: rice, enteric fermatation
p<- ggplot(
  data #%>% filter( Year=="2000")
                      , aes(x= k_production_nr                  *         24571.241+
                              k_production_nr*driv_prod_brans    *   -1251.170 + 
                              k_production_nr*driv_prod_res_cereals *  179.799
                            , y= GWP_100_CH4_Production, 
                            label = Region, 
                            Year= Year, 
                            rum= driv_prod_livst_rum, 
                            country = info_countrynames ))+ 
  geom_point()+
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

Processing

elimHelper_proc <- function(lR) {
  
  f=NULL
  cN <- colnames(lR)
  for (n in cN[3:length(cN)]){
    lR2 <- lR %>% select( -n)
    model2 <- lm( dep ~ kcr_processing_wm +.:kcr_processing_wm+0 , data = lR2 )
    r2 <- summary(model2)$r.squared
    add <- setNames( r2,n)
    f <- c(f, add)
  }
  model <- lm( dep ~ kcr_processing_wm +.:kcr_processing_wm+0 , data = lR)
  r<-summary(model)$coefficients[,4]  
  print("p-value")
  print(sort(r))
  print("r-change")
  print(summary(model)$r.squared)
  print(sort(f))
  
}

N2O

#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA

#### N2O processing emissions
# best denominator: k_processing_dm , alternatively kli_processing_dm
# data$driv_rice

lR <- data %>% mutate(dep=GWP_100_N2O_Processing)%>% select(dep,k_processing_dm ,starts_with("driv_"), k_processing_nr, kcr_processing_nr)

#model <- lm( dep ~ k_processing_dm+  .:k_processing_dm  +0, data = lR )

#summary(model)

model2 <- lm( dep ~k_processing_dm    + k_processing_dm:driv_pop + k_processing_dm:k_processing_nr+ k_processing_dm:kcr_processing_nr+0, data=lR)

summary(model2)
## 
## Call:
## lm(formula = dep ~ k_processing_dm + k_processing_dm:driv_pop + 
##     k_processing_dm:k_processing_nr + k_processing_dm:kcr_processing_nr + 
##     0, data = lR)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -499.15   -1.21    0.55    6.59  280.71 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## k_processing_dm                    7.663e+00  3.151e-02  243.18   <2e-16 ***
## k_processing_dm:driv_pop           3.321e-03  3.708e-05   89.54   <2e-16 ***
## k_processing_dm:k_processing_nr    2.231e+00  2.303e-02   96.88   <2e-16 ***
## k_processing_dm:kcr_processing_nr -3.308e+00  3.443e-02  -96.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.24 on 5419 degrees of freedom
##   (116 Beobachtungen als fehlend gelöscht)
## Multiple R-squared:  0.9903, Adjusted R-squared:  0.9903 
## F-statistic: 1.382e+05 on 4 and 5419 DF,  p-value: < 2.2e-16
#k<- olsrr::ols_step_all_possible(model)


#view(k)

# p<- ggplot(
#   data #%>% filter(Year=="2000" )
#                       , aes(x=7.6618298    *k_processing_dm, y= GWP_100_N2O_Processing, Region=Region
#                             ))+ 
#   geom_point(
#     aes( ) 
#     )+ 
#   #geom_text(aes(label = Region))+
#   geom_smooth(method='lm', formula= y~x, aes(group=1))# befor regression


### driver_for_kli_processing_dm: driv_pop, k_processing_nr, kcr_processing_nr; Or: brans and oilpalm

# maybe its about what products are procesed? k_processing_nr k_processing_dm interaction,
# more part of crop processing seems to result in less NO2 emissions. Higher pop results in higher NO2 emissions
p<- ggplot(
  data #%>% filter(Year=="2000" )
                      , aes(x=7.6618298    *k_processing_dm + 
                              k_processing_dm*driv_pop          * 0.0033206   + 
                              k_processing_dm*k_processing_nr   *2.2303566  + 
                              k_processing_dm*kcr_processing_nr* -3.3071144 
                              , y= GWP_100_N2O_Processing
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

CO2

#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA

#### CO2 processing emissions
 #data$driv_exp


data %>% mutate(dep= GWP_100_CO2_Processing) %>% select(dep, kli_processing_nr ,driv_proc_brans,  driv_proc_sugr_beet  ) -> lR

model <- lm( dep ~ kli_processing_nr+ driv_proc_brans:kli_processing_nr + kli_processing_nr:driv_proc_sugr_beet+ +0, data = lR )

summary(model)
## 
## Call:
## lm(formula = dep ~ kli_processing_nr + driv_proc_brans:kli_processing_nr + 
##     kli_processing_nr:driv_proc_sugr_beet + +0, data = lR)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -36724    -76     -3    218  42737 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## kli_processing_nr                     14616.68     407.25   35.89   <2e-16 ***
## kli_processing_nr:driv_proc_brans      1651.53      19.31   85.55   <2e-16 ***
## kli_processing_nr:driv_proc_sugr_beet  5739.90      75.15   76.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2421 on 5420 degrees of freedom
##   (116 Beobachtungen als fehlend gelöscht)
## Multiple R-squared:  0.9534, Adjusted R-squared:  0.9533 
## F-statistic: 3.693e+04 on 3 and 5420 DF,  p-value: < 2.2e-16
#k<- olsrr::ols_step_all_possible(model)

### driver: sugr and brans processing, taking squroot improves slitly the r^2
p2<- ggplot(
  data #%>% filter(Region %in% c("BRA", "IDN", "IND") )
                      , aes(  x= kli_processing_nr *   14614.71 +
                                kli_processing_nr*driv_proc_brans *1651.60 +
                                kli_processing_nr*driv_proc_sugr_beet  *5740.25    , 
                              y= GWP_100_CO2_Processing, label = Region, Year= Year   , 
                              country = info_countrynames
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p2, tooltip = c("y", "Year", "country"))

CH4

For this emission a slightly different method was tried. The model is fitted to a random subset of the data and then tested on the other part.

#### CH4 processing emissions

# main driver: kcr_processing_wm

set.seed(1001)

data %>% sample_n(2000) -> data_fit # choose model fitting sample

data_test <- data %>% anti_join(data_fit)

lR <- data_fit %>% mutate(dep=GWP_100_CH4_Processing)%>% select(dep,kcr_processing_wm, k_processing_wm, kli_processing_wm, driv_aff, driv_pop, driv_gdp, driv_dS, driv_urb, k_processing_nr, starts_with("driv_Proc") )


lR_f <- lR %>% select(-c( driv_proc_sugar , driv_aff , driv_urb, 
                         driv_proc_livst_chick, driv_proc_groundnut, driv_proc_potato, driv_proc_puls_pro , driv_proc_tece, driv_proc_cassav_sp , driv_proc_maiz, driv_proc_oilpalm , driv_proc_rapeseed,
                         driv_proc_alcohol , driv_proc_molasses  , driv_proc_woodfuel ,driv_proc_wood , driv_proc_scp , driv_proc_res_nonfibrous , driv_proc_res_fibrous , driv_proc_res_cereals , driv_proc_pasture , driv_proc_betr , driv_proc_begr , driv_proc_foddr, driv_proc_fibres  , driv_proc_oilcakes , driv_proc_ethanol , driv_proc_distillers_grain , driv_proc_fish , driv_proc_livst_rum , driv_proc_livst_pig ,driv_gdp, driv_proc_cottn_pro , driv_proc_sunflower, driv_proc_trce , driv_proc_livst_egg , driv_proc_rice_pro , driv_proc_oils , driv_proc_brans 
                         ))


#elimHelper_proc(lR_f)


model <- lm( dep ~ kcr_processing_wm +kcr_processing_wm:driv_pop+ kcr_processing_wm:driv_proc_others+0 , data = lR_f)

summary(model)
## 
## Call:
## lm(formula = dep ~ kcr_processing_wm + kcr_processing_wm:driv_pop + 
##     kcr_processing_wm:driv_proc_others + 0, data = lR_f)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8655.4  -213.2   -41.0     3.2  8231.5 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## kcr_processing_wm                  37.7156677  0.3359690  112.26   <2e-16 ***
## kcr_processing_wm:driv_pop         -0.0289276  0.0005299  -54.59   <2e-16 ***
## kcr_processing_wm:driv_proc_others  0.2461490  0.0080560   30.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 837.9 on 1963 degrees of freedom
##   (34 Beobachtungen als fehlend gelöscht)
## Multiple R-squared:  0.9006, Adjusted R-squared:  0.9004 
## F-statistic:  5927 on 3 and 1963 DF,  p-value: < 2.2e-16
#k<-olsrr::ols_step_all_possible(model)

# literature: wastewater

Fitting data.

p2<- ggplot(
  data_fit #%>% filter(Region %in% c("BRA", "IDN", "IND") )
                      , aes(  x= (kcr_processing_wm              *        37.4412095 + 
                                kcr_processing_wm*driv_pop      *       -0.0289276  + 
                                kcr_processing_wm*driv_proc_others * 0.2461490), 
                              y= GWP_100_CH4_Processing, label = Region, Year= Year  , 
                              country = info_countrynames 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p2, tooltip = c("y", "Year", "country"))

Test data.

p2<- ggplot(
  data_test #%>% filter(Region %in% c("BRA", "IDN", "IND") )
                      , aes(  x= kcr_processing_wm              *        37.4412095 + 
                                kcr_processing_wm*driv_pop      *       -0.0289276  + 
                                kcr_processing_wm*driv_proc_others * 0.2461490, 
                              y= GWP_100_CH4_Processing, label = Region, Year= Year ,
                              country = info_countrynames 
                            #color = -4.212e+02 * driv_dS + driv_aff   *  5.742e-03 + driv_govI  *  1.019e+03
                            ))+ 
  geom_point(
    aes( ) 
    )+ 
  #geom_text(aes(label = Region))+
  geom_smooth(method='lm', formula= y~x, aes(group=1)) 

plotly::ggplotly(p2, tooltip = c("y", "Year", "country"))